// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/InputStreamSource.java,v $
// $Author: derrickoswald $
// $Date: 2005/10/25 01:26:09 $
// $Revision: 1.9 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.lexer;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;

import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.ParserException;

/**
 * A source of characters based on an InputStream such as from a URLConnection.
 */
public class InputStreamSource extends Source
{
	/**
	 * An initial buffer size. Has a default value of {16384}.
	 */
	public static int BUFFER_SIZE = 16384;

	/**
	 * The stream of bytes. Set to <code>null</code> when the source is closed.
	 */
	protected transient InputStream mStream;

	/**
	 * The character set in use.
	 */
	protected String mEncoding;

	/**
	 * The converter from bytes to characters.
	 */
	protected transient InputStreamReader mReader;

	/**
	 * The characters read so far.
	 */
	protected char[] mBuffer;

	/**
	 * The number of valid bytes in the buffer.
	 */
	protected int mLevel;

	/**
	 * The offset of the next byte returned by read().
	 */
	protected int mOffset;

	/**
	 * The bookmark.
	 */
	protected int mMark;

	/**
	 * Create a source of characters using the default character set.
	 * 
	 * @param stream
	 *            The stream of bytes to use.
	 * @exception UnsupportedEncodingException
	 *                If the default character set is unsupported.
	 */
	public InputStreamSource(InputStream stream) throws UnsupportedEncodingException
	{
		this(stream, null, BUFFER_SIZE);
	}

	/**
	 * Create a source of characters.
	 * 
	 * @param stream
	 *            The stream of bytes to use.
	 * @param charset
	 *            The character set used in encoding the stream.
	 * @exception UnsupportedEncodingException
	 *                If the character set is unsupported.
	 */
	public InputStreamSource(InputStream stream, String charset)
			throws UnsupportedEncodingException
	{
		this(stream, charset, BUFFER_SIZE);
	}

	/**
	 * Create a source of characters.
	 * 
	 * @param stream
	 *            The stream of bytes to use.
	 * @param charset
	 *            The character set used in encoding the stream.
	 * @param size
	 *            The initial character buffer size.
	 * @exception UnsupportedEncodingException
	 *                If the character set is unsupported.
	 */
	public InputStreamSource(InputStream stream, String charset, int size)
			throws UnsupportedEncodingException
	{
		if (null == stream)
			stream = new Stream(null);
		else
		// bug #1044707 mark()/reset() issues
		if (!stream.markSupported())
		// wrap the stream so we can reset
			stream = new Stream(stream);
		// else
		// just because mark is supported doesn't guarantee
		// proper reset operation; there is no call to mark
		// in this code, so if reset misbehaves there is an
		// appropriate message in setEncoding() to suggest
		// wraping it in a Stream.
		// This was deemed better than an attempt to call
		// reset at this point just to check if we would
		// succeed later, or to call mark with an arbitrary
		// lookahead size
		mStream = stream;
		if (null == charset)
		{
			mReader = new InputStreamReader(stream);
			mEncoding = mReader.getEncoding();
		}
		else
		{
			mEncoding = charset;
			mReader = new InputStreamReader(stream, charset);
		}
		mBuffer = new char[size];
		mLevel = 0;
		mOffset = 0;
		mMark = -1;
	}

	//
	// Serialization support
	//

	/**
	 * Serialization support.
	 * 
	 * @param out
	 *            Where to write this object.
	 * @exception IOException
	 *                If serialization has a problem.
	 */
	private void writeObject(ObjectOutputStream out) throws IOException
	{
		int offset;
		char[] buffer;

		if (null != mStream)
		{
			// remember the offset, drain the input stream, restore the offset
			offset = mOffset;
			buffer = new char[4096];
			while (EOF != read(buffer))
				;
			mOffset = offset;
		}

		out.defaultWriteObject();
	}

	/**
	 * Deserialization support.
	 * 
	 * @param in
	 *            Where to read this object from.
	 * @exception IOException
	 *                If deserialization has a problem.
	 */
	private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException
	{
		in.defaultReadObject();
		if (null != mBuffer) // buffer is null when destroy's been called
			// pretend we're open, mStream goes null when exhausted
			mStream = new ByteArrayInputStream(new byte[0]);
	}

	/**
	 * Get the input stream being used.
	 * 
	 * @return The current input stream.
	 */
	public InputStream getStream()
	{
		return (mStream);
	}

	/**
	 * Get the encoding being used to convert characters.
	 * 
	 * @return The current encoding.
	 */
	public String getEncoding()
	{
		return (mEncoding);
	}

	/**
	 * Begins reading from the source with the given character set. If the
	 * current encoding is the same as the requested encoding, this method is a
	 * no-op. Otherwise any subsequent characters read from this page will have
	 * been decoded using the given character set.
	 * <p>
	 * Some magic happens here to obtain this result if characters have already
	 * been consumed from this source. Since a Reader cannot be dynamically
	 * altered to use a different character set, the underlying stream is reset,
	 * a new Source is constructed and a comparison made of the characters read
	 * so far with the newly read characters up to the current position. If a
	 * difference is encountered, or some other problem occurs, an exception is
	 * thrown.
	 * 
	 * @param character_set
	 *            The character set to use to convert bytes into characters.
	 * @exception ParserException
	 *                If a character mismatch occurs between characters already
	 *                provided and those that would have been returned had the
	 *                new character set been in effect from the beginning. An
	 *                exception is also thrown if the underlying stream won't
	 *                put up with these shenanigans.
	 */
	public void setEncoding(String character_set) throws ParserException
	{
		String encoding;
		InputStream stream;
		char[] buffer;
		int offset;
		char[] new_chars;

		encoding = getEncoding();
		if (!encoding.equalsIgnoreCase(character_set))
		{
			stream = getStream();
			try
			{
				buffer = mBuffer;
				offset = mOffset;
				stream.reset();
				try
				{
					mEncoding = character_set;
					mReader = new InputStreamReader(stream, character_set);
					mBuffer = new char[mBuffer.length];
					mLevel = 0;
					mOffset = 0;
					mMark = -1;
					if (0 != offset)
					{
						new_chars = new char[offset];
						if (offset != read(new_chars))
							throw new ParserException("reset stream failed");
						for (int i = 0; i < offset; i++)
							if (new_chars[i] != buffer[i])
								throw new EncodingChangeException("character mismatch (new: "
										+ new_chars[i] + " [0x"
										+ Integer.toString(new_chars[i], 16) + "] != old: "
										+ " [0x" + Integer.toString(buffer[i], 16) + buffer[i]
										+ "]) for encoding change from " + encoding + " to "
										+ character_set + " at character offset " + i);
					}
				}
				catch (IOException ioe)
				{
					throw new ParserException(ioe.getMessage(), ioe);
				}
			}
			catch (IOException ioe)
			{ // bug #1044707 mark()/reset() issues
				throw new ParserException("Stream reset failed (" + ioe.getMessage()
						+ "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);
			}
		}
	}

	/**
	 * Fetch more characters from the underlying reader. Has no effect if the
	 * underlying reader has been drained.
	 * 
	 * @param min
	 *            The minimum to read.
	 * @exception IOException
	 *                If the underlying reader read() throws one.
	 */
	protected void fill(int min) throws IOException
	{
		char[] buffer;
		int size;
		int read;

		if (null != mReader) // mReader goes null when it's been sucked dry
		{
			size = mBuffer.length - mLevel; // available space
			if (size < min) // oops, better get some buffer space
			{
				// unknown length... keep doubling
				size = mBuffer.length * 2;
				read = mLevel + min;
				if (size < read) // or satisfy min, whichever is greater
					size = read;
				else
					min = size - mLevel; // read the max
				buffer = new char[size];
			}
			else
			{
				buffer = mBuffer;
				min = size;
			}

			// read into the end of the 'new' buffer
			read = mReader.read(buffer, mLevel, min);
			if (EOF == read)
			{
				mReader.close();
				mReader = null;
			}
			else
			{
				if (mBuffer != buffer)
				{ // copy the bytes previously read
					System.arraycopy(mBuffer, 0, buffer, 0, mLevel);
					mBuffer = buffer;
				}
				mLevel += read;
			}
			// todo, should repeat on read shorter than original min
		}
	}

	//
	// Reader overrides
	//

	/**
	 * Does nothing. It's supposed to close the source, but use destroy()
	 * instead.
	 * 
	 * @exception IOException
	 *                <em>not used</em>
	 * @see #destroy
	 */
	public void close() throws IOException
	{
	}

	/**
	 * Read a single character. This method will block until a character is
	 * available, an I/O error occurs, or the end of the stream is reached.
	 * 
	 * @return The character read, as an integer in the range 0 to 65535 (
	 *         <tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the
	 *         stream has been reached
	 * @exception IOException
	 *                If an I/O error occurs.
	 */
	public int read() throws IOException
	{
		int ret;

		if (mLevel - mOffset < 1)
		{
			if (null == mStream) throw new IOException("source is closed");
			fill(1);
			if (mOffset >= mLevel)
				ret = EOF;
			else
				ret = mBuffer[mOffset++];
		}
		else
			ret = mBuffer[mOffset++];

		return (ret);
	}

	/**
	 * Read characters into a portion of an array. This method will block until
	 * some input is available, an I/O error occurs, or the end of the stream is
	 * reached.
	 * 
	 * @param cbuf
	 *            Destination buffer
	 * @param off
	 *            Offset at which to start storing characters
	 * @param len
	 *            Maximum number of characters to read
	 * @return The number of characters read, or {@link #EOF EOF} if the end of
	 *         the stream has been reached
	 * @exception IOException
	 *                If an I/O error occurs.
	 */
	public int read(char[] cbuf, int off, int len) throws IOException
	{
		int ret;

		if (null == mStream) throw new IOException("source is closed");
		if ((null == cbuf) || (0 > off) || (0 > len))
			throw new IOException("illegal argument read (" + ((null == cbuf) ? "null" : "cbuf")
					+ ", " + off + ", " + len + ")");
		if (mLevel - mOffset < len) fill(len - (mLevel - mOffset)); // minimum
																	// to
																	// satisfy
																	// this
																	// request
		if (mOffset >= mLevel)
			ret = EOF;
		else
		{
			ret = Math.min(mLevel - mOffset, len);
			System.arraycopy(mBuffer, mOffset, cbuf, off, ret);
			mOffset += ret;
		}

		return (ret);
	}

	/**
	 * Read characters into an array. This method will block until some input is
	 * available, an I/O error occurs, or the end of the stream is reached.
	 * 
	 * @param cbuf
	 *            Destination buffer.
	 * @return The number of characters read, or {@link #EOF EOF} if the end of
	 *         the stream has been reached.
	 * @exception IOException
	 *                If an I/O error occurs.
	 */
	public int read(char[] cbuf) throws IOException
	{
		return (read(cbuf, 0, cbuf.length));
	}

	/**
	 * Reset the source. Repositions the read point to begin at zero.
	 * 
	 * @exception IllegalStateException
	 *                If the source has been closed.
	 */
	public void reset() throws IllegalStateException
	{
		if (null == mStream) throw new IllegalStateException("source is closed");
		if (-1 != mMark)
			mOffset = mMark;
		else
			mOffset = 0;
	}

	/**
	 * Tell whether this source supports the mark() operation.
	 * 
	 * @return <code>true</code>.
	 */
	public boolean markSupported()
	{
		return (true);
	}

	/**
	 * Mark the present position in the source. Subsequent calls to
	 * {@link #reset()} will attempt to reposition the source to this point.
	 * 
	 * @param readAheadLimit
	 *            <em>Not used.</em>
	 * @exception IOException
	 *                If the source is closed.
	 * 
	 */
	public void mark(int readAheadLimit) throws IOException
	{
		if (null == mStream) throw new IOException("source is closed");
		mMark = mOffset;
	}

	/**
	 * Tell whether this source is ready to be read.
	 * 
	 * @return <code>true</code> if the next read() is guaranteed not to block
	 *         for input, <code>false</code> otherwise. Note that returning
	 *         false does not guarantee that the next read will block.
	 * @exception IOException
	 *                If the source is closed.
	 */
	public boolean ready() throws IOException
	{
		if (null == mStream) throw new IOException("source is closed");
		return (mOffset < mLevel);
	}

	/**
	 * Skip characters. This method will block until some characters are
	 * available, an I/O error occurs, or the end of the stream is reached.
	 * <em>Note: n is treated as an int</em>
	 * 
	 * @param n
	 *            The number of characters to skip.
	 * @return The number of characters actually skipped
	 * @exception IllegalArgumentException
	 *                If <code>n</code> is negative.
	 * @exception IOException
	 *                If an I/O error occurs.
	 */
	public long skip(long n) throws IOException, IllegalArgumentException
	{
		long ret;

		if (null == mStream) throw new IOException("source is closed");
		if (0 > n)
			throw new IllegalArgumentException("cannot skip backwards");
		else
		{
			if (mLevel - mOffset < n) fill((int) (n - (mLevel - mOffset))); // minimum
																			// to
																			// satisfy
																			// this
																			// request
			if (mOffset >= mLevel)
				ret = EOF;
			else
			{
				ret = Math.min(mLevel - mOffset, n);
				mOffset += ret;
			}
		}

		return (ret);
	}

	//
	// Methods not in your Daddy's Reader
	//

	/**
	 * Undo the read of a single character.
	 * 
	 * @exception IOException
	 *                If the source is closed or no characters have been read.
	 */
	public void unread() throws IOException
	{
		if (null == mStream) throw new IOException("source is closed");
		if (0 < mOffset)
			mOffset--;
		else
			throw new IOException("can't unread no characters");
	}

	/**
	 * Retrieve a character again.
	 * 
	 * @param offset
	 *            The offset of the character.
	 * @return The character at <code>offset</code>.
	 * @exception IOException
	 *                If the offset is beyond {@link #offset()} or the source is
	 *                closed.
	 */
	public char getCharacter(int offset) throws IOException
	{
		char ret;

		if (null == mStream) throw new IOException("source is closed");
		if (offset >= mBuffer.length)
			throw new IOException("illegal read ahead");
		else
			ret = mBuffer[offset];

		return (ret);
	}

	/**
	 * Retrieve characters again.
	 * 
	 * @param array
	 *            The array of characters.
	 * @param offset
	 *            The starting position in the array where characters are to be
	 *            placed.
	 * @param start
	 *            The starting position, zero based.
	 * @param end
	 *            The ending position (exclusive, i.e. the character at the
	 *            ending position is not included), zero based.
	 * @exception IOException
	 *                If the start or end is beyond {@link #offset()} or the
	 *                source is closed.
	 */
	public void getCharacters(char[] array, int offset, int start, int end) throws IOException
	{
		if (null == mStream) throw new IOException("source is closed");
		System.arraycopy(mBuffer, start, array, offset, end - start);
	}

	/**
	 * Retrieve a string.
	 * 
	 * @param offset
	 *            The offset of the first character.
	 * @param length
	 *            The number of characters to retrieve.
	 * @return A string containing the <code>length</code> characters at
	 *         <code>offset</code>.
	 * @exception IOException
	 *                If the offset or (offset + length) is beyond
	 *                {@link #offset()} or the source is closed.
	 */
	public String getString(int offset, int length) throws IOException
	{
		String ret;

		if (null == mStream) throw new IOException("source is closed");
		if (offset + length > mBuffer.length)
			throw new IOException("illegal read ahead");
		else
			ret = new String(mBuffer, offset, length);

		return (ret);
	}

	/**
	 * Append characters already read into a <code>StringBuffer</code>.
	 * 
	 * @param buffer
	 *            The buffer to append to.
	 * @param offset
	 *            The offset of the first character.
	 * @param length
	 *            The number of characters to retrieve.
	 * @exception IOException
	 *                If the offset or (offset + length) is beyond
	 *                {@link #offset()} or the source is closed.
	 */
	public void getCharacters(StringBuffer buffer, int offset, int length) throws IOException
	{
		if (null == mStream) throw new IOException("source is closed");
		buffer.append(mBuffer, offset, length);
	}

	/**
	 * Close the source. Once a source has been closed, further {@link #read()
	 * read}, {@link #ready ready}, {@link #mark mark}, {@link #reset reset},
	 * {@link #skip skip}, {@link #unread unread}, {@link #getCharacter
	 * getCharacter} or {@link #getString getString} invocations will throw an
	 * IOException. Closing a previously-closed source, however, has no effect.
	 * 
	 * @exception IOException
	 *                If an I/O error occurs
	 */
	public void destroy() throws IOException
	{
		mStream = null;
		if (null != mReader) mReader.close();
		mReader = null;
		mBuffer = null;
		mLevel = 0;
		mOffset = 0;
		mMark = -1;
	}

	/**
	 * Get the position (in characters).
	 * 
	 * @return The number of characters that have already been read, or
	 *         {@link #EOF EOF} if the source is closed.
	 */
	public int offset()
	{
		int ret;

		if (null == mStream)
			ret = EOF;
		else
			ret = mOffset;

		return (ret);
	}

	/**
	 * Get the number of available characters.
	 * 
	 * @return The number of characters that can be read without blocking or
	 *         zero if the source is closed.
	 */
	public int available()
	{
		int ret;

		if (null == mStream)
			ret = 0;
		else
			ret = mLevel - mOffset;

		return (ret);
	}
}
